Investigate whether regressing out potential confounders could alter data preprocessing¶

After collecting the raw counts, try regressing out some factors. Follows this script: 0_dataset_formatting_invivo_counts.ipynb

Conclusion: regressing out the general covariates does not seem to lead to differences in conclusions. I will proceed with the version without regressing out.

QC¶

In [1]:
import scanpy as sc
import anndata as ad
import os
import datetime
import pandas as pd
import numpy as np
seed=1
import matplotlib.pyplot as plt
# from pybiomart import Dataset
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['pdf.fonttype'] = 42
In [2]:
gene_list_1 = ['Olig2','Sox10','Olig1','Pdgfra','Cspg4','Myrf']
gene_list_2 = ['Olig2','Sox10','Pdgfra','Cspg4','Ascl1','Plp1','Opalin','Myrf','Mbp','Pou3f1','Mpz','Egr2','Prx','Pmp22']
gene_list_3 = ['Pou3f2','Src','Erbb3','Ntrk2','Itga1','Dag1','Ptn']
myelin_genes = ["Mag", "Plp1", "Pou3f1", "Egr2", "Prx", "Mpz", "Pmp22", "Ncmap"]
nerve_support = ["Plat", "Gap43", "Erbb3", "Gdnf", "Sostdc1", "Bdnf", "Fabp7"]
sc_genes = ["Mbp", "Ngfr", "Nfatc4", "Mpz", "Mog", "Gfap", "Ntf3", "Aqp4", "Tgfb1",
               "Pmp22", "Pou3f1", "Sox10"]
relevant_genes = gene_list_1+gene_list_2+gene_list_3+myelin_genes+nerve_support+sc_genes
relevant_genes = list(set(relevant_genes))
In [3]:
len(relevant_genes)
Out[3]:
37
In [4]:
adata = sc.read_h5ad("dt_out/0_preprocessing/invivo_raw_counts_combined_fixed_varname_filtered_beforeQC.h5ad")
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")

quick filter based on mt and total counts and genes

In [5]:
adata.obs["outlier_mt"] = adata.obs.pct_counts_mt > 22
adata.obs["outlier_total"] = adata.obs.total_counts > 0.7*1e7
adata.obs["outlier_ngenes"] = adata.obs.n_genes_by_counts < 3000
adata.obs["outlier_ERCC"] = adata.obs.pct_counts_ERCC > 20

print(
    "%u cells with high %% of mitochondrial genes"
    % (sum(adata.obs["outlier_mt"]))
)
print("%u cells with large total counts" % (sum(adata.obs["outlier_total"])))
print("%u cells with large number of genes" % (sum(adata.obs["outlier_ngenes"])))
print("%u cells with high pct_counts_ERCC" % (sum(adata.obs["outlier_ERCC"])))

adata = adata[~adata.obs["outlier_mt"], :]
adata = adata[~adata.obs["outlier_total"], :]
adata = adata[~adata.obs["outlier_ngenes"], :]
adata = adata[~adata.obs["outlier_ERCC"], :]
19 cells with high % of mitochondrial genes
11 cells with large total counts
14 cells with large number of genes
9 cells with high pct_counts_ERCC
In [6]:
adata.write_h5ad("dt_out/0_preprocessing/invivo_raw_counts_combined_fixed_varname_filtered_postBasicQC.h5ad")
In [7]:
adata = sc.read_h5ad("dt_out/0_preprocessing/invivo_raw_counts_combined_fixed_varname_filtered_postBasicQC.h5ad")
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")

Normalisation & HVG¶

In [8]:
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata, base=2)
In [9]:
adata.obs['Batch'] = adata.obs['Batch'].astype(str)
adata.obs['Animal'] = adata.obs['Animal'].astype(str)
In [10]:
adata.obs['Batch_Animal'] = adata.obs['Batch'] + adata.obs['Animal']
In [11]:
sc.pp.highly_variable_genes(adata, n_top_genes=3000, batch_key="Batch_Animal", subset=False)
In [12]:
np.sum(adata.var['highly_variable'])
Out[12]:
3007
In [13]:
hvg_selected = list(adata.var['highly_variable'][adata.var['highly_variable']].index.values) + relevant_genes
hvg_selected = list(set(hvg_selected))
hvg_selected = [x for x in hvg_selected if x.startswith("ERCC")==False]
# 
In [14]:
adata.var['highly_variable_modified'] = adata.var['highly_variable'].index.isin(hvg_selected)
In [15]:
adata.var['highly_variable_original'] = adata.var['highly_variable']
In [16]:
adata.var['highly_variable'] = adata.var['highly_variable_modified']
In [17]:
adata.write_h5ad("dt_out/0_preprocessing/invivo_raw_counts_combined_fixed_varname_filtered_postBasicQC_preHVG.h5ad")
In [18]:
sc.pl.highly_variable_genes(adata)
No description has been provided for this image
In [19]:
np.sum(adata.var.highly_variable)
Out[19]:
3000
In [20]:
sc.pp.regress_out(adata, ["total_counts", "pct_counts_mt","pct_counts_ERCC"],n_jobs=5)
In [21]:
#Ensure there is no negative value
# adata.X = adata.X + np.min(adata.X)*-1
In [22]:
sc.tl.pca(adata,use_highly_variable=True) 
In [23]:
sc.pl.pca(
    adata,
    color=["sample", "Animal", "Condition", "Batch", "Mpz"],
    ncols=2,
)
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [24]:
sc.pl.pca(adata, color = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo',"pct_counts_ERCC"])
No description has been provided for this image
In [25]:
sc.pp.neighbors(adata, 50, random_state=seed)
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [26]:
sc.tl.umap(adata,random_state=seed)
In [27]:
sc.pl.umap(
    adata,
    color=["sample", "Animal", "Condition", "Batch"],
    ncols=2,
)
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [28]:
sc.pl.umap(
    adata,
    color=["Mpz"],
    ncols=2,
)
No description has been provided for this image
In [29]:
sc.tl.leiden(adata, n_iterations=2)
In [30]:
sc.tl.leiden(adata, resolution=0.5, key_added='leiden_0.5',n_iterations=2)
In [31]:
sc.pl.umap(adata, color=["leiden",'leiden_0.5'])
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [32]:
sc.pl.umap(
    adata,
    color=["leiden", "Batch", "pct_counts_mt", "Condition"],
    wspace=0.5,
    ncols=2,
)
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [33]:
adata.write_h5ad("dt_out/0_preprocessing/invivo_raw_counts_combined_fixed_varname_filtered_afterQC_umap_v2_with_regressedout.h5ad")
In [34]:
sc.pl.umap(
    adata,
    color=["Mpz",'Sox10','Olig2'],
    wspace=0.5,
    ncols=2,
)
No description has been provided for this image
In [35]:
sc.tl.leiden(adata, resolution=0.1, key_added='leiden_0.1',n_iterations=3)
In [36]:
sc.tl.leiden(adata, resolution=0.15, key_added='leiden_0.15',n_iterations=3)
In [37]:
sc.tl.leiden(adata, resolution=0.2, key_added='leiden_0.2',n_iterations=3)
In [38]:
sc.tl.leiden(adata, resolution=0.3, key_added='leiden_0.3',n_iterations=3)
In [39]:
sc.tl.leiden(adata, resolution=0.4, key_added='leiden_0.4',n_iterations=3)
In [42]:
adata.var
Out[42]:
gene_symbol gene_name type gene_symbol_with_control mt ribo ERCC n_cells_by_counts mean_counts pct_dropout_by_counts total_counts n_cells highly_variable means dispersions dispersions_norm highly_variable_nbatches highly_variable_intersection highly_variable_modified highly_variable_original
gene_name
Arsj ENSRNOG00000000001 Arsj Gene Expression ENSRNOG00000000001 False False False 29 2.477113 94.894366 1407.0 29 False 0.888820 3.175555 0.427470 2 False False False
Gad1 ENSRNOG00000000007 Gad1 Gene Expression ENSRNOG00000000007 False False False 16 0.029930 97.183099 17.0 16 False 0.196786 1.542631 0.322993 2 False False False
Cbln1 ENSRNOG00000000010 Cbln1 Gene Expression ENSRNOG00000000010 False False False 15 1.239437 97.359155 704.0 15 True 0.621525 2.368965 0.449033 3 False True True
Tcf15 ENSRNOG00000000012 Tcf15 Gene Expression ENSRNOG00000000012 False False False 8 0.021127 98.591549 12.0 8 False 0.054413 0.354737 -0.605348 1 False False False
Steap1 ENSRNOG00000000017 Steap1 Gene Expression ENSRNOG00000000017 False False False 13 0.878521 97.711268 499.0 13 False 0.400089 1.871175 0.297288 2 False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00162 NaN ERCC-00162 NaN ERCC-00162 False False True 54 4.146127 90.492958 2355.0 54 False 1.482031 3.854193 0.415968 2 False False False
ERCC-00163 NaN ERCC-00163 NaN ERCC-00163 False False True 23 0.338028 95.950704 192.0 23 False 0.194360 1.479363 0.275842 2 False False False
ERCC-00165 NaN ERCC-00165 NaN ERCC-00165 False False True 31 3.649648 94.542254 2073.0 31 False 0.978743 3.020990 0.340308 1 False False False
ERCC-00170 NaN ERCC-00170 NaN ERCC-00170 False False True 8 0.547535 98.591549 311.0 8 False 0.420748 1.845492 0.214822 1 False False False
ERCC-00171 NaN ERCC-00171 NaN ERCC-00171 False False True 505 80.102112 11.091549 45498.0 505 False 5.271231 7.313677 1.417613 4 False False True

19459 rows × 20 columns

In [43]:
adata.var_names = adata.var['gene_name'].astype(str)
adata.var_names_make_unique()
In [44]:
sc.pl.dotplot(adata, ["Mpz",'Sox10','Olig2'], groupby='Condition')
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)
No description has been provided for this image
In [45]:
sc.pl.pca(
    adata,
    color=["pct_counts_ribo","total_counts","Condition"],
    wspace=0.5,
    ncols=2,
)
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [46]:
sc.pl.umap(adata, color=["Batch", "Condition","leiden_0.1",'leiden_0.2','leiden_0.3','leiden_0.4','leiden_0.5','Mpz',"Opalin"])
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [47]:
sc.pl.pca(adata, color=["Batch", "Condition","leiden_0.15"])
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [48]:
sc.pl.umap(adata, color=["Batch", "Condition","leiden_0.15"])
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [49]:
adata.write_h5ad("dt_out/0_preprocessing/invivo_combined_post_manual_qc_hvg_norm_log1p_leiden_with_regressed_out.h5ad")
In [50]:
adata = sc.read_h5ad("dt_out/0_preprocessing/invivo_combined_post_manual_qc_hvg_norm_log1p_leiden_with_regressed_out.h5ad")
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
In [51]:
sc.pl.umap(adata, color=["Batch", "Condition","leiden_0.5",'leiden_0.2','leiden_0.3','leiden_0.4','Mpz'])
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/Users/yy8/miniconda/envs/sc/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [52]:
sc.pl.umap(adata, color=gene_list_1,cmap = "magma", save = "umap_markers_invivo_v2.pdf")
WARNING: saving figure to file figures/umapumap_markers_invivo_v2.pdf
No description has been provided for this image
In [53]:
gene_list_3 = ['Pou3f1','Src','Erbb3','Ntrk2','Itga1','Dag1','Ptn']
In [54]:
myelin_genes = ["Mag", "Plp1", "Pou3f1", "Egr2", "Prx", "Mpz", "Pmp22"]
nerve_support = ["Plat", "Gap43", "Erbb3", "Sostdc1", "Bdnf", "Fabp7"]
sc_genes = ["Mbp", "Ngfr", "Nfatc4", "Mpz", "Mog", "Gfap", "Aqp4", "Tgfb1",
               "Pmp22", "Pou3f1", "Sox10"]
In [ ]:
 
In [ ]:
 
In [ ]: